EDA is an iterative cycle that helps you understand what your data says. When you do EDA, you:
Generate questions about your data
Search for answers by visualising, transforming, and/or modeling your data
Use what you learn to refine your questions and/or generate new questions
EDA is an important part of any data analysis. You can use EDA to make discoveries about the world; or you can use EDA to ensure the quality of your data, asking questions about whether the data meets your standards or not. (Posit Primers: EDA)
# Template: download the chosen indicators with WDI().
# The argument name is `indicator` (singular); `indicators` is not matched and
# raises an "unused argument" error.
df_dataframe_name <- WDI(
  indicator = c(name1 = "Indicator Code 1",
                name2 = "Indicator Code 2"),
  extra = TRUE
)
Write and read:
# Save the data frame as a CSV file under the data folder.
write_csv(df_dataframe_name, "data/dataframe_name.csv")
# Read the saved CSV file back into a data frame.
df_dataframe_name <- read_csv("data/dataframe_name.csv")
Use head(), str(), and summary(), or simply
try df_dataframe_name. See also the Environment tab of
RStudio.
# Templates for common dplyr/tidyr transformations.
# Keep rows where `var` equals one value.
df_dataframe_name |> filter(var == "value")
# Keep rows where `var` is one of several values
# (`...` stands for the values in between).
df_dataframe_name |> filter(var %in% c("value_1", ... , "value_n"))
# Keep rows where `var` differs from the value.
df_dataframe_name |> filter(var != "value")
# List the distinct values of `var`.
df_dataframe_name |> distinct(var)
# Remove rows in which `var` is missing.
df_dataframe_name |> drop_na(var)
# Add a new column computed from existing columns.
df_dataframe_name |> mutate(var_new = var1 * var2)
arrange()
df_dataframe_name |> arrange(var)
df_dataframe_name |> arrange(desc(var))
Visualizing using ggplot() + geom_*()
What type of variation occurs within my variables?
What type of covariation occurs between my variables?
# Line charts: each indicator over time.
transformed_data |>
  ggplot(aes(x = year, y = name1)) +
  geom_line()
transformed_data |>
  ggplot(aes(x = year, y = name2)) +
  geom_line()
# Scatter plots: one indicator against the other, optionally with a
# log10 x-axis and/or a fitted straight line.
transformed_data |>
  ggplot(aes(x = name1, y = name2)) +
  geom_point()
transformed_data |>
  ggplot(aes(x = name1, y = name2)) +
  geom_point() +
  scale_x_log10()
transformed_data |>
  ggplot(aes(x = name1, y = name2)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
transformed_data |>
  ggplot(aes(x = name1, y = name2)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_log10()
# Histogram: distribution of a single indicator.
transformed_data |>
  ggplot(aes(x = name1)) +
  geom_histogram()
categorical_var: factor(year),
income, region
# Box plot of name1 for each value of the categorical variable.
transformed_data |>
  ggplot(aes(x = categorical_var, y = name1)) +
  geom_boxplot()
library(tidyverse)
library(WDI)
If you have not downloaded WDIcache recently, run the following two code chunks; otherwise, start with the third.
# Chunk 1: fetch a fresh WDI cache object.
wdicache <- WDIcache()
# Chunk 2: save it so that later sessions can skip the download.
write_rds(wdicache, "data/wdicache.rds")
# Chunk 3: load the saved cache from the data folder.
wdicache <- read_rds("data/wdicache.rds")
List of data and its description
# Look up the indicator code in the cached indicator list.
WDIsearch(
  string = "SE.XPD.TOTL.GD.ZS",
  field = "indicator",
  short = FALSE,
  cache = wdicache
)
# Download education expenditure and the three gross enrollment ratios,
# giving each indicator a short column name.
df_education <- WDI(
  indicator = c(
    expenditure = "SE.XPD.TOTL.GD.ZS",
    primary = "SE.PRM.ENRR",
    secondary = "SE.SEC.ENRR",
    tertiary = "SE.TER.ENRR"
  ),
  extra = TRUE,
  cache = wdicache
)
# Keep a local copy and work from the saved file.
write_csv(df_education, "data/education.csv")
df_education <- read_csv("data/education.csv")
Rows: 16758 Columns: 16── Column specification ───────────────────────────────────────────────────────────
Delimiter: ","
chr (7): country, iso2c, iso3c, region, capital, income, lending
dbl (7): year, expenditure, primary, secondary, tertiary, longitude, latitude
lgl (1): status
date (1): lastupdated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_education) or df_education in R
Notebook
df_education
Structure of Data: str(df_education) or
glimpse(df_education)
str(df_education)
spc_tbl_ [16,758 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ country : chr [1:16758] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ iso2c : chr [1:16758] "AF" "AF" "AF" "AF" ...
$ iso3c : chr [1:16758] "AFG" "AFG" "AFG" "AFG" ...
$ year : num [1:16758] 2012 2008 2009 2004 2011 ...
$ status : logi [1:16758] NA NA NA NA NA NA ...
$ lastupdated: Date[1:16758], format: "2023-12-18" "2023-12-18" ...
$ expenditure: num [1:16758] 2.6 4.38 4.81 NA 3.46 ...
$ primary : num [1:16758] 106.3 103.4 99.4 106.3 100.3 ...
$ secondary : num [1:16758] 54.2 39.1 44.4 19.3 52.2 ...
$ tertiary : num [1:16758] NA NA 4.02 1.39 3.76 ...
$ region : chr [1:16758] "South Asia" "South Asia" "South Asia" "South Asia" ...
$ capital : chr [1:16758] "Kabul" "Kabul" "Kabul" "Kabul" ...
$ longitude : num [1:16758] 69.2 69.2 69.2 69.2 69.2 ...
$ latitude : num [1:16758] 34.5 34.5 34.5 34.5 34.5 ...
$ income : chr [1:16758] "Low income" "Low income" "Low income" "Low income" ...
$ lending : chr [1:16758] "IDA" "IDA" "IDA" "IDA" ...
- attr(*, "spec")=
.. cols(
.. country = col_character(),
.. iso2c = col_character(),
.. iso3c = col_character(),
.. year = col_double(),
.. status = col_logical(),
.. lastupdated = col_date(format = ""),
.. expenditure = col_double(),
.. primary = col_double(),
.. secondary = col_double(),
.. tertiary = col_double(),
.. region = col_character(),
.. capital = col_character(),
.. longitude = col_double(),
.. latitude = col_double(),
.. income = col_character(),
.. lending = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Keep only the columns used in the rest of the analysis.
df_ed <- df_education |> select(country, iso2c, year, expenditure, primary, secondary, tertiary, region, income, lending)
df_ed
# List the entries whose region is "Aggregates".
df_ed |> filter(region == "Aggregates") |> distinct(country)
# For each year, count the non-missing values of every indicator,
# listing the most recent year first.
df_ed |>
  group_by(year) |>
  summarize(
    across(
      c(expenditure, primary, secondary, tertiary),
      \(x) sum(!is.na(x))
    )
  ) |>
  arrange(desc(year))
# Expenditure over time for the "Sub-Saharan Africa" entry.
df_ed |>
  filter(country == "Sub-Saharan Africa") |>
  drop_na(expenditure) |>
  ggplot(aes(x = year, y = expenditure)) +
  geom_line()
# For rows whose region is "Sub-Saharan Africa", count the non-missing values
# of every indicator per year, most recent year first.
df_ed |>
  filter(region == "Sub-Saharan Africa") |>
  group_by(year) |>
  summarize(
    across(
      c(expenditure, primary, secondary, tertiary),
      \(x) sum(!is.na(x))
    )
  ) |>
  arrange(desc(year))
# Expenditure over time for several entries, one coloured line each.
df_ed |>
  drop_na(expenditure) |>
  filter(
    country %in% c(
      "Arab World",
      "Africa Eastern and Southern",
      "Africa Western and Central",
      "Sub-Saharan Africa",
      "South Asia"
    )
  ) |>
  ggplot(aes(x = year, y = expenditure, col = country)) +
  geom_line()
# Primary school enrollment over time for the "Sub-Saharan Africa" entry.
# Drop rows where the plotted variable (`primary`) is missing; filtering on
# `expenditure` would discard years that do have primary enrollment data.
df_ed |>
  filter(country == "Sub-Saharan Africa") |>
  drop_na(primary) |>
  ggplot(aes(year, primary)) +
  geom_line()
# Three enrollment series for the "Sub-Saharan Africa" entry on one chart:
# primary in red, secondary in blue, tertiary in the default colour.
df_ed |>
  filter(country == "Sub-Saharan Africa") |>
  drop_na(primary) |>
  ggplot() +
  geom_line(aes(x = year, y = primary), col = "red") +
  geom_line(aes(x = year, y = secondary), col = "blue") +
  geom_line(aes(x = year, y = tertiary)) +
  labs(title = "School Enrollment", y = "% gross")
df |> pivot_longer(cols = c(columns to gather), names_to = "name", values_to = "value")
primary:tertiary selects every column from the column primary through the column
tertiary
# Reshape to long format: the columns from primary to tertiary are gathered,
# with the column name stored in `levels` and the cell value in `value`.
df_ed_long <- df_ed |> pivot_longer(cols = primary:tertiary, names_to = "levels", values_to = "value")
df_ed_long
Purchasing power parities (PPPs): [R Notebook], [Rmd]
# Another pivot_longer() example: gather three GDP columns into `gdp`/`value`.
# NOTE(review): df_gdps is not defined in this section; presumably it comes
# from the linked PPP notebook -- confirm.
df_gdps_long <- df_gdps |>
pivot_longer(cols = c("gdp_nominal", "gdp_real", "gdp_ppp"),
names_to = "gdp", values_to = "value")
# Enrollment by school level for the "Sub-Saharan Africa" entry.
df_ed_long |>
  drop_na(value) |>
  filter(country == "Sub-Saharan Africa") |>
  ggplot(aes(x = year, y = value, col = levels)) +
  geom_line()
# Two entries compared: colour marks the entry, line type the school level.
df_ed_long |>
  drop_na(value) |>
  filter(country %in% c("Sub-Saharan Africa", "South Asia")) |>
  ggplot(aes(x = year, y = value, col = country, linetype = levels)) +
  geom_line()
Need map data and use geom_sf
https://www.naturalearthdata.com
Get natural earth world country polygons
CRAN: https://cran.r-project.org/web/packages/rnaturalearth/index.html
Manual: https://cran.r-project.org/web/packages/rnaturalearth/rnaturalearth.pdf
library(rnaturalearth)
library(rnaturalearthdata)
ne_countries(
scale = 110,
type = "countries",
continent = NULL,
country = NULL,
geounit = NULL,
sovereignty = NULL,
returnclass = c("sp", "sf")
)
scale: scale of map to return, one of 110, 50, 10 or ‘small’, ‘medium’, ‘large’
type: country type, one of ‘countries’, ‘map_units’, ‘sovereignty’, ‘tiny_countries’
continent: a character vector of continent names to get countries from.
country: a character vector of country names.
geounit: a character vector of geounit names.
sovereignty: a character vector of sovereignty names.
returnclass: ‘sp’ default or ‘sf’ for Simple Features
# geom_sf() draws sf objects. The usage shown above lists "sp" as the default
# return class, so request "sf" explicitly instead of relying on the default.
ne_countries(returnclass = "sf") |> ggplot() + geom_sf()
# Medium-scale world map as an sf data frame, and its structure.
ne_world <- ne_countries(scale = "medium", returnclass = "sf")
str(ne_world)
Classes ‘sf’ and 'data.frame': 241 obs. of 64 variables:
$ scalerank : int 3 1 1 1 1 3 3 1 1 1 ...
$ featurecla: chr "Admin-0 country" "Admin-0 country" "Admin-0 country" "Admin-0 country" ...
$ labelrank : num 5 3 3 6 6 6 6 4 2 6 ...
$ sovereignt: chr "Netherlands" "Afghanistan" "Angola" "United Kingdom" ...
$ sov_a3 : chr "NL1" "AFG" "AGO" "GB1" ...
$ adm0_dif : num 1 0 0 1 0 1 0 0 0 0 ...
$ level : num 2 2 2 2 2 2 2 2 2 2 ...
$ type : chr "Country" "Sovereign country" "Sovereign country" "Dependency" ...
$ admin : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ adm0_a3 : chr "ABW" "AFG" "AGO" "AIA" ...
$ geou_dif : num 0 0 0 0 0 0 0 0 0 0 ...
$ geounit : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ gu_a3 : chr "ABW" "AFG" "AGO" "AIA" ...
$ su_dif : num 0 0 0 0 0 0 0 0 0 0 ...
$ subunit : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ su_a3 : chr "ABW" "AFG" "AGO" "AIA" ...
$ brk_diff : num 0 0 0 0 0 0 0 0 0 0 ...
$ name : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ name_long : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ brk_a3 : chr "ABW" "AFG" "AGO" "AIA" ...
$ brk_name : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ brk_group : chr NA NA NA NA ...
$ abbrev : chr "Aruba" "Afg." "Ang." "Ang." ...
$ postal : chr "AW" "AF" "AO" "AI" ...
$ formal_en : chr "Aruba" "Islamic State of Afghanistan" "People's Republic of Angola" NA ...
$ formal_fr : chr NA NA NA NA ...
$ note_adm0 : chr "Neth." NA NA "U.K." ...
$ note_brk : chr NA NA NA NA ...
$ name_sort : chr "Aruba" "Afghanistan" "Angola" "Anguilla" ...
$ name_alt : chr NA NA NA NA ...
$ mapcolor7 : num 4 5 3 6 1 4 1 2 3 3 ...
$ mapcolor8 : num 2 6 2 6 4 1 4 1 1 1 ...
$ mapcolor9 : num 2 8 6 6 1 4 1 3 3 2 ...
$ mapcolor13: num 9 7 1 3 6 6 8 3 13 10 ...
$ pop_est : num 103065 28400000 12799293 14436 3639453 ...
$ gdp_md_est: num 2258 22270 110300 109 21810 ...
$ pop_year : num NA NA NA NA NA NA NA NA NA NA ...
$ lastcensus: num 2010 1979 1970 NA 2001 ...
$ gdp_year : num NA NA NA NA NA NA NA NA NA NA ...
$ economy : chr "6. Developing region" "7. Least developed region" "7. Least developed region" "6. Developing region" ...
$ income_grp: chr "2. High income: nonOECD" "5. Low income" "3. Upper middle income" "3. Upper middle income" ...
$ wikipedia : num NA NA NA NA NA NA NA NA NA NA ...
$ fips_10 : chr NA NA NA NA ...
$ iso_a2 : chr "AW" "AF" "AO" "AI" ...
$ iso_a3 : chr "ABW" "AFG" "AGO" "AIA" ...
$ iso_n3 : chr "533" "004" "024" "660" ...
$ un_a3 : chr "533" "004" "024" "660" ...
$ wb_a2 : chr "AW" "AF" "AO" NA ...
$ wb_a3 : chr "ABW" "AFG" "AGO" NA ...
$ woe_id : num NA NA NA NA NA NA NA NA NA NA ...
$ adm0_a3_is: chr "ABW" "AFG" "AGO" "AIA" ...
$ adm0_a3_us: chr "ABW" "AFG" "AGO" "AIA" ...
$ adm0_a3_un: num NA NA NA NA NA NA NA NA NA NA ...
$ adm0_a3_wb: num NA NA NA NA NA NA NA NA NA NA ...
$ continent : chr "North America" "Asia" "Africa" "North America" ...
$ region_un : chr "Americas" "Asia" "Africa" "Americas" ...
$ subregion : chr "Caribbean" "Southern Asia" "Middle Africa" "Caribbean" ...
$ region_wb : chr "Latin America & Caribbean" "South Asia" "Sub-Saharan Africa" "Latin America & Caribbean" ...
$ name_len : num 5 11 6 8 7 5 7 20 9 7 ...
$ long_len : num 5 11 6 8 7 13 7 20 9 7 ...
$ abbrev_len: num 5 4 4 4 4 5 4 6 4 4 ...
$ tiny : num 4 NA NA NA NA 5 5 NA NA NA ...
$ homepart : num NA 1 1 NA 1 NA 1 1 1 1 ...
$ geometry :sfc_MULTIPOLYGON of length 241; first list element: List of 1
..$ :List of 1
.. ..$ : num [1:10, 1:2] -69.9 -69.9 -69.9 -70 -70.1 ...
..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
- attr(*, "sf_column")= chr "geometry"
- attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "names")= chr [1:63] "scalerank" "featurecla" "labelrank" "sovereignt" ...
# World map filled by the region_wb column.
ne_world |> ggplot() + geom_sf(aes(fill = region_wb))
# Education expenditure for 2020, keyed by the two-letter country code.
df_ed_2020 <- df_ed |> filter(year == 2020) |> select(iso2c, expenditure)
# Attach the expenditure to the map polygons (map wb_a2 = data iso2c).
ne_ed_2020 <- ne_world |> left_join(df_ed_2020, by = c("wb_a2" = "iso2c"))
# Choropleth maps: the whole world, then South-Eastern Asia only.
ne_ed_2020 |> ggplot() + geom_sf(aes(fill = expenditure))
ne_ed_2020 |> filter(subregion == "South-Eastern Asia") |> ggplot() + geom_sf(aes(fill = expenditure))
# Range of the observed values. Name the column explicitly rather than
# relying on pull()'s default of taking the last column.
df_ed_2020 |> drop_na(expenditure) |> pull(expenditure) |> range()
[1] 1.371014 13.779448
# Distribution of the 2020 expenditure values in bins of width 3.
df_ed_2020 |>
  drop_na(expenditure) |>
  ggplot(aes(x = expenditure)) +
  geom_histogram(binwidth = 3)
It is possible to use quantile.
# Quartiles and maximum of the 2020 expenditure values.
df_ed_2020 |>
  drop_na(expenditure) |>
  pull(expenditure) |>
  quantile(probs = c(0.25, 0.5, 0.75, 1))
25% 50% 75% 100%
3.531991 4.420020 5.338555 13.779448
# World map with expenditure cut into four classes.
ne_ed_2020 |>
  mutate(
    level = cut(
      expenditure,
      breaks = c(0, 3, 6, 9, 20),
      labels = c("0-3", "3-6", "6-9", "9-14")
    )
  ) |>
  ggplot() +
  geom_sf(aes(fill = level)) +
  labs(title = "Government expenditure on education, total (% of GDP) in 2020")
# The same classes for Africa only; the title is split over two lines.
ne_ed_2020 |>
  filter(continent == "Africa") |>
  mutate(
    level = cut(
      expenditure,
      breaks = c(0, 3, 6, 9, 20),
      labels = c("0-3", "3-6", "6-9", "9-14")
    )
  ) |>
  ggplot() +
  geom_sf(aes(fill = level)) +
  labs(title = "Government expenditure on education, \ntotal (% of GDP) in 2020")
We study the relationship between school enrollment at the secondary and tertiary levels and GDP per capita.
# Search indicator names with a regular expression. Pass the cache so the
# search uses the same indicator list as the other WDI calls in this file.
# Note: unescaped parentheses form a regex group, so this matches
# "school enrollment", then any characters, then "% gross".
WDIsearch(
  string = "school enrollment.*(% gross)",
  field = "name",
  short = FALSE,
  cache = wdicache
)
Q to AI: What does ‘.*’ mean as a regular expression?
A of Poe Assistant: In regular expressions, the pattern ‘.*’ is a commonly used expression that matches any sequence of characters (including an empty sequence). Here’s what it means:
The dot ‘.’ matches any single character except a newline.
The asterisk ‘*’ is a quantifier that matches zero or more occurrences of the preceding element (in this case, the dot).
Therefore, when ‘.*’ is used as a regular expression, it will match any sequence of characters, regardless of length or content. It’s a way to express a wildcard or a catch-all pattern.
# Download secondary and tertiary gross enrollment together with the GDP per
# capita indicator, under short column names.
df_sec_ter_gdp <- WDI(
  indicator = c(
    secondary = "SE.SEC.ENRR",
    tertiary = "SE.TER.ENRR",
    gdppcap = "NY.GDP.PCAP.PP.KD"
  ),
  extra = TRUE,
  cache = wdicache
)
write_csv(df_sec_ter_gdp, "data/sec_ter_gdp.csv")
# Read back into the SAME name. The previous `df_df_sec_ter_gdp` was a typo,
# so the re-read data was never used by the code that follows.
df_sec_ter_gdp <- read_csv("data/sec_ter_gdp.csv")
Rows: 16758 Columns: 15── Column specification ───────────────────────────────────────────────────────────
Delimiter: ","
chr (7): country, iso2c, iso3c, region, capital, income, lending
dbl (6): year, secondary, tertiary, gdppcap, longitude, latitude
lgl (1): status
date (1): lastupdated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_sec_ter_gdp
df_sec_ter_gdp |> str()
'data.frame': 16758 obs. of 15 variables:
$ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ iso2c : chr "AF" "AF" "AF" "AF" ...
$ iso3c : chr "AFG" "AFG" "AFG" "AFG" ...
$ year : int 2012 2008 2009 2004 2011 2013 2014 2010 2003 2007 ...
$ status : chr "" "" "" "" ...
$ lastupdated: chr "2023-12-18" "2023-12-18" "2023-12-18" "2023-12-18" ...
$ secondary : num 54.2 39.1 44.4 19.3 52.2 ...
..- attr(*, "label")= chr "School enrollment, secondary (% gross)"
$ tertiary : num NA NA 4.02 1.39 3.76 ...
..- attr(*, "label")= chr "School enrollment, tertiary (% gross)"
$ gdppcap : num 2123 1557 1824 1260 1961 ...
..- attr(*, "label")= chr "GDP per capita, PPP (constant 2017 international $)"
$ region : chr "South Asia" "South Asia" "South Asia" "South Asia" ...
$ capital : chr "Kabul" "Kabul" "Kabul" "Kabul" ...
$ longitude : chr "69.1761" "69.1761" "69.1761" "69.1761" ...
$ latitude : chr "34.5228" "34.5228" "34.5228" "34.5228" ...
$ income : chr "Low income" "Low income" "Low income" "Low income" ...
$ lending : chr "IDA" "IDA" "IDA" "IDA" ...
# Long format: secondary/tertiary go into the default `name`/`value` columns;
# then keep only the columns needed for the plots.
df_sec_ter_gdp_long <- df_sec_ter_gdp |>
  pivot_longer(cols = c(secondary, tertiary)) |>
  select(
    country, iso2c, year, gdppcap,
    name, value, region, income
  )
df_sec_ter_gdp_long
# Secondary and tertiary enrollment over time for one entry.
COUNTRY <- "World"
df_sec_ter_gdp_long |>
  filter(country == COUNTRY) |>
  drop_na(value) |>
  ggplot() +
  geom_line(aes(year, value, col = name)) +
  # The `+` before labs() was missing, so the title was evaluated as a
  # separate statement and never added to the plot.
  labs(title = "School enrollment; Secondary and Tertiary")
$title
[1] "School enrollment; Secondary and Tertiary"
attr(,"class")
[1] "labels"
Observations:
# Income-group entries, in the order used for the legend.
INCOME <- c(
  "High income", "Upper middle income", "Middle income",
  "Lower middle income", "Low & middle income", "Low income"
)
# Enrollment over time per income group: colour marks the income group
# (ordered as in INCOME), line type marks the school level.
df_sec_ter_gdp_long |>
  filter(country %in% INCOME) |>
  drop_na(value) |>
  ggplot(aes(year, value,
             col = factor(country, levels = INCOME),
             linetype = name)) +
  geom_line() +
  ylim(c(0, 110)) +
  labs(title = "School enrollment: Secondary and Tertiary",
       col = "Income Levels", # fixed legend typo ("Incom Levels")
       linetype = "School Levels", y = "")
Observations
# 2020 cross-section: enrollment against GDP per capita, coloured by level.
df_sec_ter_gdp_long |>
  filter(year == 2020) |>
  drop_na(value, gdppcap) |>
  ggplot(aes(x = gdppcap, y = value, col = name)) +
  geom_point() +
  labs(
    title = "School enrollment: Secondary and Tertiary vs GDP per capita",
    y = ""
  )
Observations
# The same scatter plot with a log10 x-axis.
df_sec_ter_gdp_long |>
  filter(year == 2020) |>
  drop_na(value, gdppcap) |>
  ggplot(aes(x = gdppcap, y = value, col = name)) +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "School enrollment; Secondary and Tertiary vs GDP per capita in log10 scale",
    y = ""
  )
# ... and with a fitted straight line per school level.
df_sec_ter_gdp_long |>
  filter(year == 2020) |>
  drop_na(value, gdppcap) |>
  ggplot(aes(x = gdppcap, y = value, col = name)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = 'y~x') +
  scale_x_log10() +
  labs(
    title = "School enrollment; Secondary and Tertiary vs GDP per capita in log10 scale",
    y = ""
  )
# Linear model of tertiary enrollment on log10(GDP per capita) for 2020.
# The `_` placeholder passes the piped data frame to lm()'s `data` argument.
df_sec_ter_gdp_long |> filter(year == 2020) |> drop_na(gdppcap, value) |>
filter(name == "tertiary") |>
lm(value~log10(gdppcap), data = _) |> summary()
Call:
lm(formula = value ~ log10(gdppcap), data = filter(drop_na(filter(df_sec_ter_gdp_long,
year == 2020), gdppcap, value), name == "tertiary"))
Residuals:
Min 1Q Median 3Q Max
-70.323 -8.289 -0.605 8.209 83.181
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -155.838 12.831 -12.14 <2e-16 ***
log10(gdppcap) 48.735 3.064 15.90 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.36 on 170 degrees of freedom
Multiple R-squared: 0.598, Adjusted R-squared: 0.5957
F-statistic: 252.9 on 1 and 170 DF, p-value: < 2.2e-16
Observations
# 2020 enrollment by school level, one box per region (aggregates excluded).
df_sec_ter_gdp_long |>
  filter(year == 2020, region != "Aggregates") |>
  drop_na(value, region) |>
  ggplot(aes(x = name, y = value, fill = region)) +
  geom_boxplot() +
  labs(
    title = "Secondary and tertiary school enrollment by region",
    y = "School enrollment (% gross)",
    x = "",
    fill = ""
  ) +
  theme(legend.position = "top")
Observations
# 2020 enrollment by school level, one box per income level, ordered as in
# INCOME (aggregates excluded).
df_sec_ter_gdp_long |>
  filter(year == 2020, region != "Aggregates") |>
  drop_na(value, income) |>
  ggplot(aes(x = name, y = value, fill = factor(income, levels = INCOME))) +
  geom_boxplot() +
  labs(
    title = "Secondary and tertiary school enrollment by income level",
    y = "School enrollment (% gross)",
    x = "",
    fill = ""
  ) +
  theme(legend.position = "top")
Observations
The title can be set in the title field of the YAML header.
We study …..
library(tidyverse)
library(WDI)
Create data folder if you do not have it under Files.
# Create the data folder only when it is missing, so re-running this chunk
# does not raise an "already exists" warning.
if (!dir.exists("data")) {
  dir.create("data")
}
If you do not have wdicache.rds in your data folder,
run the following two code chunks.
# Fetch a fresh WDI cache object and save it to the data folder, so that
# wdicache.rds exists for later sessions (the second of the "two code chunks").
wdicache <- WDIcache()
write_rds(wdicache, "data/wdicache.rds")
Edit short_name_1, chosen_indicator_1, short_name_2,
chosen_indicator_2, etc. You can edit df_yourdata to a
descriptive name. If you edit df_yourdata , please edit
other parts as well.
# Template: replace short_name_* and chosen_indicator_* with your own column
# names and indicator codes before running.
df_yourdata <- WDI(
indicator = c(short_name_1 = chosen_indicator_1,
short_name_2 = chosen_indicator_2),
extra = TRUE, cache = wdicache)
# Save the download, then work from the saved CSV file.
write_csv(df_yourdata, "data/yourdata.csv")
df_yourdata <- read_csv("data/yourdata.csv")
# Example reshape to long format using the default `name`/`value` columns.
# NOTE(review): this reuses df_sec_ter_gdp from the worked example; presumably
# it should be adapted to df_yourdata and your own column names -- confirm.
df_sec_ter_gdp_long <- df_sec_ter_gdp |>
pivot_longer(cols = c(secondary, tertiary))
df_sec_ter_gdp_long
Observations and Questions:
Observations and Questions:
Observations and Questions:
If possible, create a choropleth map. (a challenge, not required)
Observations and Questions: